﻿
#include "stdafx.h"

// Channel masks for RGB565 pixels. Each 64-bit constant repeats the same
// 16-bit pattern in all four word lanes so that one MMX operation handles
// four pixels at once. ablend_565 references them by name as memory operands
// from its inline assembly.
static const QWORD MASKB = 0x001F001F001F001F;		// blue:  bits 0-4 of every word
static const QWORD MASKG = 0x07E007E007E007E0;		// green: bits 5-10 of every word
static const QWORD MASKSHIFTG = 0x03F003F003F003F0;	// green mask shifted right by one bit (MASKG >> 1)
static const QWORD MASKR = 0xF800F800F800F800;		// red:   bits 11-15 of every word

// Constants used by the integer alpha blending equation (again replicated
// into all four word lanes).
static const QWORD SIXTEEN = 0x0010001000100010;	// 16 per lane: rounding term added to the red and blue sums
static const QWORD FIVETWELVE = 0x0200020002000200;	// 512 per lane: rounding term added to the green sum
static const QWORD SIXONES = 0x003F003F003F003F;	// 63 per lane: maximum 6-bit alpha, used to form (63 - a)

// Alpha-blends a rectangle of RGB565 pixels from a source surface onto a
// destination surface with MMX, four pixels per pass, driven by a per-pixel
// alpha map.
//
// Parameters
//   lpAlpha    Base of the alpha map. Although typed LPWORD, the assembly
//              consumes it as ONE BYTE per pixel (4 bytes are loaded per group
//              of 4 pixels). 0xFF selects the source pixel, 0x00 keeps the
//              destination pixel.
//   iAlpPitch  Distance in bytes between two alpha rows.
//   lpSrc      Base of the source surface (16-bit pixels).
//   iSrcX/Y    Pixel origin inside the source surface; the same origin is
//              used to index the alpha map.
//   iSrcPitch  Distance in bytes between two source rows.
//   lpDst      Base of the destination surface (16-bit pixels).
//   iDstX/Y    Pixel origin inside the destination surface.
//   iDstW/H    Size of the rectangle in pixels / lines.
//   iDstPitch  Distance in bytes between two destination rows.
//
// Behaviour notes
//   * An empty rectangle (iDstW == 0 or iDstH == 0) returns immediately.
//   * If the first source address has bit 1 or bit 2 set (i.e. it is not
//     qword aligned), the function silently does nothing.
//   * The loop pre-loads the next 4 alpha bytes and 4 source pixels before it
//     checks how many pixels remain, and the 1-3 pixel tail still loads full
//     qwords from source and destination. Up to 4 alpha bytes, 8 source bytes
//     and 6 destination bytes past the end of a row are therefore READ (never
//     written); callers must make sure that memory is readable.
//   * ebx, esi and edi are used inside the _asm block.
//     NOTE(review): this relies on the compiler saving/restoring them around
//     inline assembly - confirm for the toolchain in use.
void ablend_565(
	LPWORD lpAlpha, UINT iAlpPitch, LPWORD lpSrc, UINT iSrcX, UINT iSrcY, UINT iSrcPitch,
	LPWORD lpDst, UINT iDstX, UINT iDstY, UINT iDstW, UINT iDstH, UINT iDstPitch)
{
	// Nothing to draw. Without this guard a height of 0 would make the
	// "dec ecx / jz" line counter wrap around and run off the surfaces, and a
	// width of 0 would still dereference all three buffers.
	if (iDstW == 0 || iDstH == 0)
		return;

	// All offsets below are BYTE offsets (2 bytes per colour pixel, 1 byte per
	// alpha value, pitches in bytes - the assembly advances rows with
	// "add reg, pitch"). They must therefore be applied to byte pointers;
	// adding them to an LPWORD would scale every offset by sizeof(WORD).
	unsigned char *lpLinearDstBp = (unsigned char *)lpDst + (iDstX << 1) + (iDstY * iDstPitch);	// first destination pixel
	unsigned char *lpLinearSrcBp = (unsigned char *)lpSrc + (iSrcX << 1) + (iSrcY * iSrcPitch);	// first source pixel
	unsigned char *lpLinearAlpBp = (unsigned char *)lpAlpha + iSrcX + (iSrcY * iAlpPitch);		// first alpha byte

	_asm
	{
		// Register roles:
		//   esi = source cursor       edi = destination cursor
		//   eax = alpha cursor        edx = the 4 alpha bytes of the current group
		//   ebx = pixels left in row  ecx = rows left
		mov			esi, lpLinearSrcBp	// src
		mov			edi, lpLinearDstBp	// dst
		mov			eax, lpLinearAlpBp	// alpha
		mov			ecx, iDstH			// ecx = number of lines to copy
		mov			ebx, iDstW			// ebx = span width to copy
		test		esi, 6				// check if source address is qword aligned
										// (address coming in is assumed to be word aligned)
		jnz			done				// if not qword aligned we do not do anything

primeloop:
		// Load the operands of the next group of four pixels.
		movd		mm1, [eax]			// mm1 = 00 00 00 00 a3 a2 a1 a0
		pxor		mm2, mm2			// mm2 = 0
		movq		mm4, [esi]			// g1: mm4 = src3 src2 src1 src0
		punpcklbw	mm1, mm2			// mm1 = 00a3 00a2 00a1 00a0

loopqword:
		mov			edx, [eax]			// edx = a3 a2 a1 a0 for the early-out tests
		test		ebx, 0xFFFFFFFC		// fewer than 4 pixels left in this row?
		jz			checkback			// yes: handle the 0-3 pixel tail

		// early out tests
		cmp			edx, 0xffffffff		// all four alphas fully opaque?
		je			copyback			// then copy the source pixels to the destination
		test		edx, 0xffffffff		// all four alphas fully transparent?
		jz			leavefront			// then skip these 4 pixels

		// The alpha blend starts. Green, red and blue are interleaved to
		// keep the MMX pipes busy; the gN / bN / rN tags name the steps of
		// each channel.
		//   green (6 bit):    i = a*sg + (63-a)*dg;  i = (i+32)+((i+32)>>6)>>6;
		//   red/blue (5 bit): i = a*s  + (31-a)*d;   i = (i+16)+((i+16)>>5)>>5;
		movq		mm5, [edi]			// g2: mm5 = dst3 dst2 dst1 dst0
		psrlw		mm1, 2				// mm1 = a>>2 (6-bit alpha for green)
		movq		mm7, MASKSHIFTG		// g3: mm7 = green mask shifted right by 1
		psrlw		mm4, 1				// g3a: move src green down by 1 so that we won't overflow
		movq		mm0, mm1			// mm0 = 6-bit alphas
		psrlw		mm5, 1				// g3b: move dst green down by 1 so that we won't overflow
		psrlw		mm1, 1				// mm1 = a>>3 (5-bit alpha for red and blue)
		pand		mm4, mm7			// g5: mm4 = sg3 sg2 sg1 sg0
		movq		mm2, SIXONES		// g4: mm2 = 63
		pand		mm5, mm7			// g7: mm5 = dg3 dg2 dg1 dg0
		movq		mm3, [esi]			// b1: mm3 = src3 src2 src1 src0
		psubsb		mm2, mm0			// g6: mm2 = 63-a3 63-a2 63-a1 63-a0
		movq		mm7, MASKB			// b2: mm7 = blue mask
		pmullw		mm4, mm0			// g8: mm4 = sg*a
		movq		mm0, [edi]			// b3: mm0 = dst3 dst2 dst1 dst0
		pmullw		mm5, mm2			// g9: mm5 = dg*(63-a)
		movq		mm2, mm7			// b4: mm2 = 31 in every lane
		pand		mm3, mm7			// b4: mm3 = sb3 sb2 sb1 sb0
		pmullw		mm3, mm1			// b6: mm3 = sb*a
		pand		mm0, mm7			// b5: mm0 = db3 db2 db1 db0
		movq		mm7, [esi]			// r1: mm7 = src3 src2 src1 src0
		paddw		mm4, mm5			// g10: mm4 = sg*a + dg*(63-a)
		pand		mm7, MASKR			// r2: mm7 = sr3 sr2 sr1 sr0 (still in bits 11-15)
		psubsb		mm2, mm1			// b5a: mm2 = 31-a3 31-a2 31-a1 31-a0
		paddw		mm4, FIVETWELVE		// g11: mm4 = mm4 + 512 (green rounding term)
		pmullw		mm0, mm2			// b7: mm0 = db*(31-a)
		movq		mm5, mm4			// g12: mm5 = mm4 green
		psrlw		mm7, 11				// r4: shift src red down to position 0
		psrlw		mm4, 6				// g13: mm4 = mm4>>6
		paddw		mm4, mm5			// g14: mm4 = mm4 + mm5 green
		paddw		mm0, mm3			// b8: mm0 = sb*a + db*(31-a)
		movq		mm5, [edi]			// r3: mm5 = dst3 dst2 dst1 dst0
		paddw		mm0, SIXTEEN		// b9: mm0 = mm0 + 16 blue
		pand		mm5, MASKR			// r5: mm5 = dr3 dr2 dr1 dr0 (still in bits 11-15)
		psrlw		mm4, 5				// g15: green result moved to bits 5-10 (plus junk below)
		movq		mm3, mm0			// b10: mm3 = mm0 blue
		psrlw		mm0, 5				// b11: mm0 = mm0>>5 blue
		psrlw		mm5, 11				// r6: shift dst red down to position 0
		paddw		mm0, mm3			// b12: mm0 = mm3 + mm0 blue
		psrlw		mm0, 5				// b13: mm0 = 000b 000b 000b 000b blue
		pmullw		mm7, mm1			// mm7 = sr*a
		pand		mm4, MASKG			// g16: mm4 = 00g0 00g0 00g0 00g0 green
		pmullw		mm5, mm2			// r7: mm5 = dr*(31-a)
		por			mm0, mm4			// mm0 = 00gb 00gb 00gb 00gb
		add			eax, 4				// move to next 4 alphas
		add			esi, 8				// move to next 4 pixels in src
		add			edi, 8				// move to next 4 pixels in dst
		movd		mm1, [eax]			// pre-load next alphas: mm1 = 00 00 00 00 a3 a2 a1 a0
		paddw		mm5, mm7			// r8: mm5 = sr*a + dr*(31-a)
		paddw		mm5, SIXTEEN		// r9: mm5 = mm5 + 16 red
		pxor		mm2, mm2			// mm2 = 0
		movq		mm7, mm5			// r10: mm7 = mm5 red
		psrlw		mm5, 5				// r11: mm5 = mm5>>5 red
		movq		mm4, [esi]			// g1: pre-load next source pixels
		paddw		mm5, mm7			// r12: mm5 = mm7 + mm5 red
		punpcklbw	mm1, mm2			// mm1 = 00a3 00a2 00a1 00a0
		psrlw		mm5, 5				// r13: mm5 = mm5>>5 red
		psllw		mm5, 11				// r14: mm5 = mm5<<11, red back in bits 11-15
		por			mm0, mm5			// mm0 = 0rgb 0rgb 0rgb 0rgb
		sub			ebx, 4				// polished off 4 pixels
		movq		[edi-8], mm0		// store the 4 blended pixels (edi was already advanced)
		jmp			loopqword			// go back to start

copyback:
		movq		[edi], mm4			// fully opaque: copy source to destination
leavefront:
		add			edi, 8				// advance destination by 4 pixels
		add			eax, 4				// advance alpha by 4
		add			esi, 8				// advance source by 4 pixels
		sub			ebx, 4				// decrease pixel count by 4
		jmp			primeloop			// reload alpha/source for the next group

checkback:
		test		ebx, 0xFF			// check if 0 pixels left
		jz			nextline			// done with this span

		// Tail of 1-3 pixels: blend a full group of four exactly as above
		// (mm1 / mm4 already hold the pre-loaded alpha and source), then
		// store only the pixels that belong to the row.
		movq		mm5, [edi]			// g2: mm5 = dst3 dst2 dst1 dst0
		psrlw		mm1, 2				// mm1 = a>>2 (6-bit alpha for green)
		movq		mm7, MASKSHIFTG		// g3: mm7 = green mask shifted right by 1
		psrlw		mm4, 1				// g3a: move src green down by 1 so that we won't overflow
		movq		mm0, mm1			// mm0 = 6-bit alphas
		psrlw		mm5, 1				// g3b: move dst green down by 1 so that we won't overflow
		psrlw		mm1, 1				// mm1 = a>>3 (5-bit alpha for red and blue)
		pand		mm4, mm7			// g5: mm4 = sg3 sg2 sg1 sg0
		movq		mm2, SIXONES		// g4: mm2 = 63
		pand		mm5, mm7			// g7: mm5 = dg3 dg2 dg1 dg0
		movq		mm3, [esi]			// b1: mm3 = src3 src2 src1 src0
		psubsb		mm2, mm0			// g6: mm2 = 63-a3 63-a2 63-a1 63-a0
		movq		mm7, MASKB			// b2: mm7 = blue mask
		pmullw		mm4, mm0			// g8: mm4 = sg*a
		movq		mm0, [edi]			// b3: mm0 = dst3 dst2 dst1 dst0
		pmullw		mm5, mm2			// g9: mm5 = dg*(63-a)
		movq		mm2, mm7			// b4: mm2 = 31 in every lane
		pand		mm3, mm7			// b4: mm3 = sb3 sb2 sb1 sb0
		pmullw		mm3, mm1			// b6: mm3 = sb*a
		pand		mm0, mm7			// b5: mm0 = db3 db2 db1 db0
		movq		mm7, [esi]			// r1: mm7 = src3 src2 src1 src0
		paddw		mm4, mm5			// g10: mm4 = sg*a + dg*(63-a)
		pand		mm7, MASKR			// r2: mm7 = sr3 sr2 sr1 sr0 (still in bits 11-15)
		psubsb		mm2, mm1			// b5a: mm2 = 31-a3 31-a2 31-a1 31-a0
		paddw		mm4, FIVETWELVE		// g11: mm4 = mm4 + 512 (green rounding term)
		pmullw		mm0, mm2			// b7: mm0 = db*(31-a)
		movq		mm5, mm4			// g12: mm5 = mm4 green
		psrlw		mm7, 11				// r4: shift src red down to position 0
		psrlw		mm4, 6				// g13: mm4 = mm4>>6
		paddw		mm4, mm5			// g14: mm4 = mm4 + mm5 green
		paddw		mm0, mm3			// b8: mm0 = sb*a + db*(31-a)
		movq		mm5, [edi]			// r3: mm5 = dst3 dst2 dst1 dst0
		paddw		mm0, SIXTEEN		// b9: mm0 = mm0 + 16 blue
		pand		mm5, MASKR			// r5: mm5 = dr3 dr2 dr1 dr0 (still in bits 11-15)
		psrlw		mm4, 5				// g15: green result moved to bits 5-10 (plus junk below)
		movq		mm3, mm0			// b10: mm3 = mm0 blue
		psrlw		mm0, 5				// b11: mm0 = mm0>>5 blue
		psrlw		mm5, 11				// r6: shift dst red down to position 0
		paddw		mm0, mm3			// b12: mm0 = mm3 + mm0 blue
		psrlw		mm0, 5				// b13: mm0 = 000b 000b 000b 000b blue
		pmullw		mm7, mm1			// mm7 = sr*a
		pand		mm4, MASKG			// g16: mm4 = 00g0 00g0 00g0 00g0 green
		pmullw		mm5, mm2			// r7: mm5 = dr*(31-a)
		por			mm0, mm4			// mm0 = 00gb 00gb 00gb 00gb
		add			eax, 4				// move to next 4 alphas (eax is reloaded at nextline)
		paddw		mm5, mm7			// r8: mm5 = sr*a + dr*(31-a)
		paddw		mm5, SIXTEEN		// r9: mm5 = mm5 + 16 red
		movq		mm7, mm5			// r10: mm7 = mm5 red
		psrlw		mm5, 5				// r11: mm5 = mm5>>5 red
		paddw		mm5, mm7			// r12: mm5 = mm7 + mm5 red
		psrlw		mm5, 5				// r13: mm5 = mm5>>5 red
		psllw		mm5, 11				// r14: mm5 = mm5<<11, red back in bits 11-15
		por			mm0, mm5			// mm0 = 0rgb 0rgb 0rgb 0rgb
		test		ebx, 2				// are there at least 2 pixels to store?
		jz			oneendpixel			// no: exactly one pixel is left
		movd		[edi], mm0			// store the two low pixels
		psrlq		mm0, 32				// bring pixel 2 down to the low word
		add			edi, 4				// skip the two stored pixels
		sub			ebx, 2				// 2 pixels done
		jz			nextline			// row finished
oneendpixel:							// store the last pixel
		movd		edx, mm0			// edx low word = last blended pixel
		mov			[edi], dx			// 16-bit store so nothing past the row is written

nextline:
		dec			ecx					// one line done
		jz			done				// all lines done
		mov			eax, lpLinearAlpBp	// alpha
		mov			esi, lpLinearSrcBp	// src
		mov			edi, lpLinearDstBp	// dst
		add			eax, iAlpPitch		// advance alpha ptr by 1 line
		add			esi, iSrcPitch		// advance src ptr by 1 line
		add			edi, iDstPitch		// advance dst ptr by 1 line
		mov			lpLinearAlpBp, eax	// save new alpha base ptr
		mov			ebx, iDstW			// ebx = span width to copy
		mov			lpLinearSrcBp, esi	// save new src base ptr
		mov			lpLinearDstBp, edi	// save new dst base ptr
		jmp			primeloop			// start the next span

done:
		emms							// leave MMX state so x87 floating point works again
	}
}
